#####################################################################################
##################      BIODIVERSITY REPLICATION - DATA GENERATION    ###############
#####################################################################################

#Code accompanying:
#Andrew Piper, "Biodiversity is not declining in fiction" Journal of Cultural Analytics (2022)

##### Make Metadata Table for PG Data #####

#Build the metadata table for the Project Gutenberg books used in the study:
#keep the catalogue rows whose id matches a text file in gut_BIO, require both
#author birth and death years, and add an estimated publication year.

#Ingest list of books (file names without the .txt extension)
setwd("~/Data")
id<-list.files("gut_BIO")
#strip only a literal, trailing ".txt"; the former pattern ".txt" treated the
#dot as a wildcard and could also match inside a file name
id<-sub("\\.txt$", "", id)

#Input meta data from Project Gutenberg
setwd("~/Research/biodiversity")
pg2<-read.csv("PG_Meta_All.csv")

#subset by matching
pg2.sub<-pg2[pg2$id %in% id,]

#subset by only books with author birth and death dates
has.dates<-!is.na(pg2.sub$authoryearofbirth) & !is.na(pg2.sub$authoryearofdeath)
pg2.sub<-pg2.sub[has.dates,]

#add estimated year of publication following Langer et al formula:
#0.5*(birth+21+death)
pg2.sub$pub.date<-round(0.5*(pg2.sub$authoryearofbirth + 21 + pg2.sub$authoryearofdeath))

#save
write.csv(pg2.sub, file="PG_metadata_bio.csv", row.names = FALSE)

########## Sample Sentences for validating model ###############
#This takes 100 random 50 word passages beginning at a new sentence where we have at least 
#one plant/animal detection and outputs them as a single table for annotation.

#we validate on the Stanford + Chicago data using 3 separate coders

#Draw validation passages from a folder of bookNLP token tables.
#
#corpus.dir  : folder holding one tab-separated bookNLP table per book
#n.books     : number of books to sample (one passage is drawn per book)
#passage.len : number of tokens per passage (default 50)
#
#For every sampled book one sentence containing at least one token tagged
#B-noun.plant or B-noun.animal is chosen at random, and the passage is the
#passage.len tokens starting at the first token of that sentence.
#Returns one data frame with the token rows of all passages plus a fileID
#column naming the source file. Books without any plant/animal token are
#skipped with a warning.
#Files are read through their full path, so the working directory is not changed.
sample_validation_passages<-function(corpus.dir, n.books, passage.len=50){
  #sample books
  book.files<-sample(list.files(corpus.dir), n.books)
  #one list slot per book; bound together once at the end
  passages<-vector("list", length(book.files))
  #for each book
  for (i in seq_along(book.files)){
    #ingest book as bookNLP table
    tokens<-read.csv(file.path(corpus.dir, book.files[i]), sep="\t", quote = "", stringsAsFactors = FALSE)
    #sentences holding at least one plant/animal token
    is.bio<-tokens$supersense %in% c("B-noun.plant", "B-noun.animal")
    sent.ids<-unique(tokens$sentenceID[is.bio])
    if (length(sent.ids) == 0){
      warning("no plant/animal tokens in ", book.files[i], "; book skipped", call. = FALSE)
      next
    }
    #sample by position: sample(x, 1) on a single number x draws from 1:x
    #instead of returning x
    sent<-sent.ids[sample.int(length(sent.ids), 1)]
    #row of the first token of the chosen sentence
    first.row<-which(tokens$sentenceID == sent)[1]
    #take passage.len tokens from there, stopping at the end of the book so
    #no all-NA filler rows are produced
    last.row<-min(first.row + passage.len - 1, nrow(tokens))
    passage<-tokens[first.row:last.row,]
    #add filename column
    passage$fileID<-book.files[i]
    #store; list() keeps the slot even if the value were NULL
    passages[i]<-list(passage)
  }
  #drop the slots of skipped books and stack the rest
  passages<-passages[!vapply(passages, is.null, logical(1))]
  do.call(rbind, passages)
}

#For Stanford Dataset
val.df<-sample_validation_passages("~/Data/19C_Stanford_Novel_bookNLP", 16)

#For Chicago Dataset
val.df2<-sample_validation_passages("~/Data/CHICAGO_SAMPLE_bookNLP", 17)

#stack the Stanford and Chicago passages into one table
val.final<-rbind(val.df, val.df2)

#write the full token table (kept as the reference copy for this coder)
setwd("~/Research/biodiversity")
write.csv(val.final, file="biodiversity_validation_01_All.csv", row.names = FALSE)
#write.csv(val.final, file="biodiversity_validation_02_All.csv", row.names = F)
#write.csv(val.final, file="biodiversity_validation_03_All.csv", row.names = F)

#reduced sheet for validators: the original word plus two empty text columns
#(bio, notes) for the coder to fill in
n.tokens<-nrow(val.final)
val.final2<-data.frame(
  sentence = val.final[, colnames(val.final) == "originalWord"],
  bio = character(n.tokens),
  notes = character(n.tokens)
)
write.csv(val.final2, file="biodiversity_validation_01_EE.csv", row.names = FALSE)
#write.csv(val.final2, file="biodiversity_validation_02_AZ.csv", row.names = F)
#write.csv(val.final2, file="biodiversity_validation_03_LF.csv", row.names = F)

##### Ingest coder validations and merge with original tables #####
#Reads the three annotated coder sheets and the three full token tables they
#were derived from, stacks each set in the same coder order (01, 02, 03),
#binds them side by side and saves the result as Validation_Bio.csv.
setwd("~/Research/biodiversity")

#coder annotations
coder.files<-c("Validation_Coder_01.csv", "Validation_Coder_02.csv", "Validation_Coder_03.csv")
annotations<-do.call(rbind, lapply(coder.files, read.csv))

#original token tables
token.files<-c("biodiversity_validation_01_All.csv", "biodiversity_validation_02_All.csv", "biodiversity_validation_03_All.csv")
originals<-do.call(rbind, lapply(token.files, read.csv))

#the two tables are joined purely by row position, so they must have the same
#number of rows; cbind could otherwise recycle the shorter table silently
if (nrow(originals) != nrow(annotations)){
  stop("token tables have ", nrow(originals), " rows but coder sheets have ",
       nrow(annotations), "; cannot align annotations", call. = FALSE)
}

#append annotations
all2<-cbind(originals, annotations)

#test for matching: the coder sheets' 'sentence' column was exported from
#originalWord, so that is the column it has to agree with
#NOTE(review): assumes the coder files still carry the 'sentence' column
#unchanged - confirm against the returned sheets
test<-all2[which(all2$originalWord != all2$sentence),]
if (nrow(test) > 0){
  warning(nrow(test), " annotated rows do not match the original tokens; inspect 'test'", call. = FALSE)
}

#save
write.csv(all2, "Validation_Bio.csv", row.names = FALSE)


##########    GENERATE MEASURES + TAXA LISTS      ############
#This takes as input bookNLP .tokens tables
#It calculates the three measures:
#abundance
#richness
#diversity
#Using 2 different methods:
#DB = lemma matching
#ML = supervised machine learning
#It also generates list of taxa using the two methods

library(tm)
library(entropy)

#Set-up for the measures loop: load the taxon list, the corpus metadata and
#the list of .tokens files, and create the empty result containers.
#All metadata is read first and the tokens folder is entered last, because
#fileNames holds bare file names that the loop opens relative to the working
#directory.

#get taxa lists for DB method
setwd("~/Research/biodiversity")
db<-read.csv("TaxonDB_english.csv")
black<-read.csv("Blacklist.txt", header=FALSE)
#remove blacklist terms (drop = FALSE keeps db a data frame even if it has a
#single column)
db<-db[!db$Term %in% black$V1, , drop = FALSE]

#load metadata for PG data
pg.meta<-read.csv("PG_metadata_bio.csv")

#load metadata for Stanford+Chicago data
setwd("~/Data")
stan.m<-read.csv("19C_Stanford_Novel_750_Meta.csv")
chic.m<-read.csv("CHICAGO_CORPUS_NOVELS.csv")

#### get filenames of .tokens table in a single directory

#### for Stanford+Chicago data
setwd("~/Data/bio_StanChicago_bookNLP")
fileNames<-list.files()

#Subset Chicago Metadata to the books present in the folder; only a literal,
#trailing ".tokens" is removed from the file names
chic.m<-chic.m[chic.m$FILENAME %in% sub("\\.tokens$", "", fileNames),]

### for PG Data
#setwd("~/Data")
#fileNames<-list.files()

#empty final table for each method
final.df<-NULL

#empty output taxa lists for each method
taxa.ml<-vector()
taxa.db<-vector()
#taxa.db.nouns<-vector()

#For every book: compute abundance, richness and diversity with both methods
#(ML = bookNLP supersense tags, DB = lemma matching against db$Term) and
#collect the matched taxa.
#Uses fileNames, db, stan.m, chic.m, final.df, taxa.ml and taxa.db from the
#set-up section and entropy() from the entropy package; the .tokens files are
#opened by the names in fileNames, relative to the working directory.

#richness/diversity are averaged over N random samples of K consecutive words
N<-10
K<-1000

#ML taxa of a token table.
#tok: bookNLP token table with columns tokenId, lemma, supersense.
#Tokens tagged I-noun.plant / I-noun.animal are joined with the lemma of the
#token directly before them (found by tokenId) into a 2-gram; those tokens and
#their predecessors are then removed and every remaining token whose supersense
#matches noun.plant / noun.animal is kept as a 1-gram.
#Returns a character vector: 1-grams followed by 2-grams.
get_ml_taxa<-function(tok){
  inside<-grep("I-noun.plant|I-noun.animal", tok$supersense)
  inside.id<-tok$tokenId[inside]
  prev.row<-match(inside.id-1, tok$tokenId)
  #a predecessor that is not in the table (e.g. removed as punctuation or
  #outside the sample window) is pasted as an empty string
  prev.lemma<-ifelse(is.na(prev.row), "", tok$lemma[prev.row])
  bigrams<-paste(prev.lemma, tok$lemma[inside], sep = " ")
  rest<-tok[!tok$tokenId %in% c(inside.id, inside.id-1),]
  unigrams<-rest$lemma[grep("noun.plant|noun.animal", rest$supersense)]
  c(unigrams, bigrams)
}

#DB taxa of a token table.
#tok  : bookNLP token table with columns tokenId, lemma, pos.
#terms: vector of taxon terms to match against.
#Candidate tokens are all nouns (NN, NNS) plus the token directly before each
#noun. 2-grams are built from candidates that are neighbours in the text
#(consecutive tokenId) and matched against terms; the tokens of matched 2-grams
#are removed before the remaining nouns are matched as 1-grams.
#Returns a character vector: matched 2-grams followed by matched 1-grams.
get_db_taxa<-function(tok, terms){
  noun.id<-tok$tokenId[tok$pos %in% c("NN", "NNS")]
  #ids that do not exist in the table simply match nothing, so no range
  #filter is needed (a "> 0" filter would discard tokenId 0)
  cand<-tok[tok$tokenId %in% c(noun.id, noun.id-1),]
  #pair each candidate with the next one; no pair is built from the last
  #candidate back to the first
  i<-seq_len(max(nrow(cand)-1, 0))
  bi<-paste(cand$lemma[i], cand$lemma[i+1], sep = " ")
  #only pairs of tokens that directly follow each other are real 2-grams
  adjacent<-(cand$tokenId[i+1]-cand$tokenId[i]) == 1
  hit<-which(adjacent & bi %in% terms)
  db.2g<-bi[hit]
  #remove the words of matched 2-grams from the unigram candidates
  if (length(hit) > 0){
    cand<-cand[-c(hit, hit+1),]
  }
  #keep only Nouns
  nouns<-cand[cand$pos %in% c("NN", "NNS"),]
  db.1g<-nouns$lemma[nouns$lemma %in% terms]
  c(db.2g, db.1g)
}

#per-book results are collected in lists and combined once after the loop
book.rows<-vector("list", length(fileNames))
ml.parts<-vector("list", length(fileNames))
db.parts<-vector("list", length(fileNames))

#for every book
for (i in seq_along(fileNames)){
  
  print(i)
  
  #ingest the i-th file
  a<-read.csv(fileNames[i], sep="\t", quote = "", stringsAsFactors = FALSE)
  #remove tokens whose lemma contains punctuation to get word count
  #(logical mask: a[-grep(...),] would return zero rows when nothing matches)
  a1<-a[!grepl("[[:punct:]]", a$lemma),]
  n.words<-nrow(a1)
  
  #ABUNDANCE
  #divide total number of plant/animal tokens by total words
  
  ##### METHOD = ML #######
  plant<-sum(grepl("B-noun.plant", a$supersense))/n.words
  animal<-sum(grepl("B-noun.animal", a$supersense))/n.words
  #animal abundance without tokens whose lemma is "horse"
  noHorse<-sum(grepl("B-noun.animal", a$supersense) & a$lemma != "horse", na.rm = TRUE)/n.words
  #combine plants and animals in ML method
  freq.ml<-plant+animal
  
  #store all plant/animal phrases of the book
  ml.g<-get_ml_taxa(a)
  ml.parts[i]<-list(ml.g)
  
  ##### METHOD = DB #######
  db.allg<-get_db_taxa(a, db$Term)
  #get total frequency
  freq.db<-length(db.allg)/n.words
  db.parts[i]<-list(db.allg)
  
  ########  TAXA DIVERSITY + RICHNESS   ##########
  
  #one value per sample; stays NA when the book cannot be sampled
  ttr.v.ml<-rep(NA_real_, N)
  entropy.v.ml<-rep(NA_real_, N)
  ttr.v.db<-rep(NA_real_, N)
  entropy.v.db<-rep(NA_real_, N)
  
  if (n.words >= K){
    #for N samples
    for (k in seq_len(N)){
      
      #random starting point such that a full window of K words fits
      start<-sample.int(n.words-K+1, 1)
      #subset by sample of length K from random starting point
      sub<-a1[start:(start+K-1),]
      
      #######  METHOD = ML ##########
      ml.g.samp<-get_ml_taxa(sub)
      #RICHNESS: number of unique lemmas per K words (the same definition
      #whether or not the sample contains multi-word entities)
      ttr.v.ml[k]<-length(unique(ml.g.samp))
      #DIVERSITY: entropy of lemmas per K words
      entropy.v.ml[k]<-entropy(table(ml.g.samp))
      
      ###### METHOD = DB ###########
      db.allg.samp<-get_db_taxa(sub, db$Term)
      #lexical richness == # types per frame
      ttr.v.db[k]<-length(unique(db.allg.samp))
      #alpha diversity
      entropy.v.db[k]<-entropy(table(db.allg.samp))
    }
    
    #calculate average of samples
    ttr.ml<-mean(ttr.v.ml, na.rm = TRUE)
    entropy.ml<-mean(entropy.v.ml, na.rm = TRUE)
    ttr.db<-mean(ttr.v.db, na.rm = TRUE)
    entropy.db<-mean(entropy.v.db, na.rm = TRUE)
  } else {
    #book is shorter than one sample window: no richness/diversity estimate
    warning(fileNames[i], " has fewer than ", K, " words; richness and diversity set to NA", call. = FALSE)
    ttr.ml<-NA_real_
    entropy.ml<-NA_real_
    ttr.db<-NA_real_
    entropy.db<-NA_real_
  }
  
  #save METADATA
  
  ### For Stanford+Chicago
  filename<-sub("\\.tokens$", "", basename(fileNames[i]))
  #reset for every book so a file without metadata cannot inherit the year of
  #the previous book
  year<-NULL
  #if filename is in STAN
  if (filename %in% stan.m$FILENAME){
    year<-stan.m$DATE[stan.m$FILENAME %in% filename]
  }
  #if in CHIC
  if (filename %in% chic.m$FILENAME){
    year<-chic.m$PUBL_DATE[chic.m$FILENAME %in% filename]
  }
  
  ### For PG
  #filename<-gsub(".tokens", "", fileNames[i])
  #year<-pg.meta$pub.date[pg.meta$id == filename]
  
  #store only books for which a year was found
  if (length(year) > 0){
    book.rows[i]<-list(data.frame(filename, year, plant, animal, noHorse, freq.ml, freq.db, ttr.ml, ttr.db, entropy.ml, entropy.db))
  }
}

#combine the per-book pieces with whatever the containers already held
book.rows<-book.rows[!vapply(book.rows, is.null, logical(1))]
final.df<-rbind(final.df, do.call(rbind, book.rows))
taxa.ml<-append(taxa.ml, unlist(ml.parts, use.names = FALSE))
taxa.db<-append(taxa.db, unlist(db.parts, use.names = FALSE))

#write the per-book measures and the two taxa frequency lists (most frequent
#taxon first) to the project folder
setwd("~/Research/biodiversity")
write.csv(final.df, file = "Stan_Chicago_Results_New.csv", row.names = FALSE)

db.counts<-sort(table(taxa.db), decreasing = TRUE)
write.csv(db.counts, file = "Stan_Chicago_Taxa_List_DB_New.csv", row.names = FALSE)

ml.counts<-sort(table(taxa.ml), decreasing = TRUE)
write.csv(ml.counts, file = "Stan_Chicago_Taxa_List_ML_New.csv", row.names = FALSE)




